In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
import seaborn as sns; sns.set(color_codes=True)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import sys
sys.setrecursionlimit(5000)
mpl.style.use('default')

Saving and uploading environment variables each session

In [74]:
import shelve
filename='./shelve_TAD_3dBrowser_analysis.out'
my_shelf = shelve.open(filename,'n') # 'n' for new
shelved = []
error = []
for key in dir():
    if not(key.startswith('_')):
        try:
            my_shelf[key] = globals()[key]
            shelved.append(str(key))
        except:
            error.append(str(key))
my_shelf.close()

print("## Successfully Shelved: ##")
_ = [print(str(i)) for i in shelved]
print("\n## ERROR Shelving: ##")
_ = [print(str(i)) for i in error]
## Successfully Shelved: ##
In
InteractiveShell
Out
TAD_master
TAD_master_100k
TAD_master_30k
ax
betweenTADsize
buffer
cell_type_tads
chr1_len
chromTADcorr
corr
corr100k
corr30k
createMasterTADList
diff
diff_matrix
error
filename
findTADinCellTypes
g
input_file_list
key
line
my_shelf
num_of_TADs
shelved
sum
tads_merged
total_corr
win

## ERROR Shelving: ##
csv
exit
f
get_ipython
mpl
np
pd
plt
quit
reader
shelve
sns
sys
In [ ]:
import shelve
filename='./shelve_TAD_3dBrowser_analysis.out'
my_shelf = shelve.open(filename)
opened = []
error = []
for key in my_shelf:
    try:
        globals()[key]=my_shelf[key]
        opened.append(str(key))
    except:
        error.append(str(key))
my_shelf.close()

print("## Successfully Opened: ##")
_ = [print(str(i)) for i in opened]
print("\n## ERROR Shelving: ##")
_ = [print(str(i)) for i in error]

Data from 3d_genome_browser TAD coordinates generated from the Dixon 2012 paper pipeline (http://promoter.bx.psu.edu/hi-c/publications.html) found here: /dors/capra_lab/users/bentonml/enhancer_gain_loss/data/reg_domains/tad/3d_genome_browser/

In [41]:
import csv
with open('/dors/capra_lab/users/evonne/TAD/data/3dGenomeBrowser_cellTypes_fileLocs_ordered.txt') as f:
    reader = csv.reader(f, delimiter = "\t")
    input_file_list = list(reader)
    
cell_type_tads = {}

for line in input_file_list:
    cell_type_tads[line[1]] = pd.read_table(line[0], header=None)
    cell_type_tads[line[1]].columns = ['chr','start','stop']

# access data frames in the dictionary like this: cell_type_tads['bladder'].head()

Number of TADs in each cell type

In [42]:
sum = 0
num_of_TADs = []
for key in cell_type_tads:
    print(key + ":   " + str(len(cell_type_tads[key])))
    sum = sum + len(cell_type_tads[key])
    num_of_TADs.append(len(cell_type_tads[key]))
    
    
print("\n\n\n" + str(sum) + " number of NON unique TADs over 37 unique tissue/cell types")
_ = plt.hist(num_of_TADs, bins=15)
plt.show()
aorta_leung2015:   1531
Liver_leung2015:   2066
thymus_leung2015:   984
leftVentricle_leung2015:   1652
A549_lungAdenocarcinoma_dekker:   1842
Caki2_clearCellRenalCellCarcinoma_dekker:   1901
G401_Wilms_tumor_dekker:   1929
LNCaP_prostateAdenocarcinoma_dekker:   1920
NCIH460_NSCLC_dekker:   1705
PANC1_pancreaticCarcinoma_dekker:   1744
RPMI7951_melanoma_dekker:   1669
SJCRH30_BMrhabdomyosarcoma_dekker:   1625
SKMEL5_melanoma_dekker:   2172
SKNDZ_neurblastoma_dekker:   1472
SKNMC_neuroblastoma_dekker:   1588
T470_breastCancer_dekker:   1857
adrenal_schmitt2016:   1718
bladder_schmitt2016:   1554
smallBowel_schmitt2016:   1298
cortex_DLPFC_schmitt2016:   1966
lung_schmitt2016:   1815
psoasMuscle_schmitt2016:   1333
pancreas_schmitt2016:   1383
spleen_schmitt2016:   1845
rightVentricle_schmitt2016:   1625
H1_ESC_Dixon2015:   2499
H1_mesendoderm_Dixon2015:   2479
H1_mesenchymalSC_Dixon2015:   2209
H1_neuralSC_Dixon2015:   1756
H1_trophoblastLike_Dixon2015:   2203
GM12878_lymphoblastoid_Lieberman:   3120
HMEC_humanMammaryEpithelial_Lieberman:   3235
HUVEC_Lieberman:   2842
IMR90_fetalLungFibroblast_Lieberman:   3157
K562_CML_Lieberman:   3019
KBM7_CML_Lieberman:   2879
NHEK_epidermalKeratinocytes_Lieberman:   2832



74424 number of NON unique TADs over 37 unique tissue/cell types

Some summary stats about the TADs

In [62]:
sns.set_palette(sns.husl_palette(37,l=.6))
ax = plt.subplot(111)
diff_matrix = pd.DataFrame()
for key in cell_type_tads:
    diff = pd.DataFrame(cell_type_tads[key]['stop'] - cell_type_tads[key]['start'])
    diff.columns = [key]
    ax = diff.plot.kde(xlim =(0,5000000),figsize=(10,6), title='Zoom-ed in TAD size by tissue/cell type', ax=ax)
    diff_matrix = pd.concat([diff_matrix,diff],axis=1)

ax.legend(loc='upper center', bbox_to_anchor=(1.2, 1))

plt.show()
mpl.style.use('default')
Out[62]:
<matplotlib.legend.Legend at 0x7fb08c0ec4a8>
In [44]:
g = sns.set(rc={'figure.figsize':(50,4)})
g = sns.violinplot(data=diff_matrix,figsize=(60,6))
g = plt.xticks(rotation=45)
g = plt.ylim(0,5000000)

Function to find base pair size between two TADs

In [45]:
def betweenTADsize(data_frame): # input: BED file coordinates
    
    diff = np.array([])

    for i in range(len(data_frame) - 1):
        if data_frame.loc[i + 1,'chr'] == data_frame.loc[i,'chr']:
            difference =  data_frame.loc[i + 1,'start'] - data_frame.loc[i,'stop'] 
            diff = np.append(diff, difference)
            
    return(pd.DataFrame(diff)) # returns data frame of the basepair size between TAD boundaries
In [46]:
tads_merged  = pd.DataFrame(columns = ['chr','start','stop'])

for key in cell_type_tads:
    tads_merged = pd.merge(tads_merged,cell_type_tads[key],how='outer',sort=True)
    
tads_merged = tads_merged.drop_duplicates() # there shouldnt be any duplicates but drop them if there are
print(str(len(tads_merged)) + " number of UNIQUE TADs of 37 unique tissue/cell types if given no \"buffer\" for the start/stop location")
47149 number of UNIQUE TADs of 37 unique tissue/cell types if given no "buffer" for the start/stop location

interTAD distances by tissue/cell type

In [47]:
sns.set_palette(sns.husl_palette(37,l=.6))
ax = plt.subplot(111)
diff_matrix = pd.DataFrame()
for key in cell_type_tads:
    diff = betweenTADsize(cell_type_tads[key])
    diff.columns = [key]
    ax = diff.plot.kde(xlim = (0,800000), figsize=(10,6), title='Zoom-ed in interTAD distances by tissue/cell type', ax=ax)
    diff_matrix = pd.concat([diff_matrix,diff],axis=1)
    #diff.describe()
ax.legend(loc='upper center', bbox_to_anchor=(1.2, 1))

plt.show()
mpl.style.use('default')
Out[47]:
<matplotlib.legend.Legend at 0x7fb08d16c518>
In [48]:
g = sns.set(rc={'figure.figsize':(50,4)})
g = sns.violinplot(data=diff_matrix,figsize=(60,6))
g = plt.xticks(rotation=45)
g = plt.ylim(0,500000)

Note: some of the TAD boundaries probably truely represent the same TAD boundary but start/stop at a slightly different spot

Create one master TAD list with intervals for TAD start and stop sites

In [49]:
def createMasterTADList(buffer, tads_merged):
    ## Input:
    # buffer: integer which is the window of the buffer (if you give 200KB you add 100kb and subtract 100kb to look for other TAD boundaries)
    # tads_merged: a list of all the TAD boundaries merged
    buffer = buffer*2
    tmp = tads_merged.copy(deep=True)
    TAD_master = []

    while len(tmp) != 0:
        start = tmp.loc[0,'start']
        stop = tmp.loc[0,'stop']
        chrom = tmp.loc[0,'chr']
        start_min = start - buffer
        start_max = start + buffer
        stop_min = stop - buffer
        stop_max = stop + buffer
        tmp.drop(tmp[(tmp['chr'] == chrom) & (tmp['start'] > start_min) & (tmp['start'] < start_max) & (tmp['stop'] > stop_min) & (tmp['stop'] < stop_max)].index, inplace=True)
        tmp = tmp.reset_index(drop=True)
        TAD_master.append([chrom,start_min,start_max,stop_min,stop_max])
    
    del tmp
    
    TAD_master = pd.DataFrame(np.vstack(TAD_master))
    TAD_master.columns = ['chr','start_min','start_max','stop_min','stop_max']

    TAD_master = TAD_master.apply(pd.to_numeric, errors = 'ignore')
    
    return(TAD_master)
In [50]:
# Note: this may take some time to run

buffer = 15000
TAD_master_30k = createMasterTADList(buffer,tads_merged)
print(str(len(TAD_master_30k)) + " number of UNIQUE TADs of 37 unique tissue/cell types if given a " + str(buffer) + "bp \"buffer\" for the start/stop location")

buffer = 1
TAD_master = createMasterTADList(buffer,tads_merged)
print(str(len(TAD_master)) + " number of UNIQUE TADs of 37 unique tissue/cell types if given a " + str(buffer) + "bp \"buffer\" for the start/stop location")

buffer = 50000
TAD_master_100k = createMasterTADList(buffer,tads_merged)
print(str(len(TAD_master_100k)) + " number of UNIQUE TADs of 37 unique tissue/cell types if given a " + str(buffer) + "bp \"buffer\" for the start/stop location")
38861 number of UNIQUE TADs of 37 unique tissue/cell types if given a 15000bp "buffer" for the start/stop location
47149 number of UNIQUE TADs of 37 unique tissue/cell types if given a 1bp "buffer" for the start/stop location
21242 number of UNIQUE TADs of 37 unique tissue/cell types if given a 50000bp "buffer" for the start/stop location
In [63]:
TAD_master_30k.head()
Out[63]:
chr start_min start_max stop_min stop_max aorta_leung2015 Liver_leung2015 thymus_leung2015 leftVentricle_leung2015 A549_lungAdenocarcinoma_dekker ... H1_mesenchymalSC_Dixon2015 H1_neuralSC_Dixon2015 H1_trophoblastLike_Dixon2015 GM12878_lymphoblastoid_Lieberman HMEC_humanMammaryEpithelial_Lieberman HUVEC_Lieberman IMR90_fetalLungFibroblast_Lieberman K562_CML_Lieberman KBM7_CML_Lieberman NHEK_epidermalKeratinocytes_Lieberman
0 chr1 -30000 30000 650000 710000 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 chr1 -30000 30000 1810000 1870000 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
2 chr1 -30000 30000 3330000 3390000 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
3 chr1 -30000 30000 3370000 3430000 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 chr1 -30000 30000 3570000 3630000 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0

5 rows × 42 columns

Function to run through a bed and add a column to the master TAD list, 1 represents that TAD does exist in the bed file, and 0 represents that it does not exist

In [66]:
def findTADinCellTypes(data_frame, stringOfCellTypeName, TAD_total): 
    ## Input:
    # data_frame: BED file coordinates data frame - cell type specific
    # stringOfCellTypeName: string describing the cell type name
    # TAD_total: dataframe of all merged TADs - the "master" list

    TAD_total[stringOfCellTypeName] = 0

    for i in range(len(data_frame)):
        index = TAD_total[(TAD_total['chr'] == data_frame.loc[i,'chr']) & (TAD_total['start_min'] < data_frame.loc[i,'start']) & (TAD_total['start_max'] > data_frame.loc[i,'start']) & (TAD_total['stop_min'] < data_frame.loc[i,'stop']) & (TAD_total['stop_max'] > data_frame.loc[i,'stop'])].index
        TAD_total.loc[index,stringOfCellTypeName] = 1 
    return TAD_total
In [67]:
#Note: this may take some time to run
#findTADinCellTypes(cell_type_tads['bladder'],'bladder',TAD_master).head()

for key in cell_type_tads:
    TAD_master_30k = findTADinCellTypes(cell_type_tads[key], key, TAD_master_30k)
    TAD_master = findTADinCellTypes(cell_type_tads[key], key, TAD_master)
    TAD_master_100k = findTADinCellTypes(cell_type_tads[key], key, TAD_master_100k)
    

These plots are not working because it exceeds 15000 recursions

In [ ]:
#g = sns.clustermap(TAD_master.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1), figsize=(14,14))
In [ ]:
#g = sns.clustermap(TAD_master_30k.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1), figsize=(14,14))
In [ ]:
#g = sns.clustermap(TAD_master_100k.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1), figsize=(14,14))

different clustering method:

In [68]:
# Correlation matrix between tissue types, two different ways

corr = TAD_master.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1).corr()
corr30k = TAD_master_30k.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1).corr()
corr100k = TAD_master_100k.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1).corr()

g = sns.clustermap(corr, cmap="BuPu")
g = plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) # rotating the y-axis labels
In [69]:
g = sns.clustermap(corr30k, cmap="BuPu")
g = plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) # rotating the y-axis labels
In [70]:
g = sns.clustermap(corr100k, cmap="BuPu")
g = plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) # rotating the y-axis labels

Parts that were interesting to me

Bladder + Adrenal ? Right Ventricle & Left Ventricle - good sanity check, also aorta is nearby and psoas which i would expect (muscular tissues) The lieberman samples are all close - an artifact or makes sense? Dixon data as well some of the cancers are close but some are also part of an outgroup ? (rhabdomyosarcoma)

Correlation of TADs across the chromosome

In [71]:
def chromTADcorr(chrom_str, chr_length, TAD_master, tissue_of_interest, win):
    total_corr = []
    slide = int(win/5)

    for i in range(slide,chr_length,slide):
        low = i - (win/2)
        high = i + (win/2)
        corr = TAD_master[(TAD_master['chr'] == chrom_str) & (((TAD_master['start_min'] + TAD_master['stop_max'])/2) > low) & (((TAD_master['start_min'] + TAD_master['stop_max'])/2) < high)].drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1).corr().loc[tissue_of_interest]
        total_corr.append(corr)

    total_corr = pd.DataFrame(np.vstack(total_corr))
    total_corr.columns = TAD_master.drop(['chr','start_min','start_max','stop_min','stop_max'],axis=1).columns
    return(total_corr)
In [73]:
chr1_len = 249250621
win = 5000000
total_corr = chromTADcorr('chr1', chr1_len, TAD_master_30k, 'rightVentricle_schmitt2016', win)
sns.set_palette(sns.husl_palette(37,l=.6))

ax = plt.subplot(111)
ax = total_corr.plot(figsize=(50,6),title='Chr1',ax=ax)
ax.legend(loc='upper center', bbox_to_anchor=(1, 1))
mpl.style.use('default')
Out[73]:
<matplotlib.legend.Legend at 0x7fb08d95f588>
In [ ]:
 

Next steps idea:

* Are there cell/tissue-specific genes in areas where TADs are unique for that cell/tissue type?

* Take correlation of all TAD/CTCF structure at multiple different locations

* If the structure is very conserved (high correlation) for all except 1 or 2 tissue types

* Find genes within that TAD (near that boundary) and see if they might be cell/tissue specific - could do this by looking at gene/phenotype ontology

Find ultraconserved TADs

What is ultraconserved? What is the distribution of how many tissues each TAD is found in?

In [ ]:
def findDistofConservedTADs(TAD_master, name_of_dataset, percentile):   
    unique = []
    for i in range(1,38): 
        unique.append([i,len(TAD_master[(TAD_master.iloc[:,5:].sum(axis=1) == i)])])
    plt.bar(np.array(unique)[:,0], np.array(unique)[:,1])
    plt.title(name_of_dataset)
    plt.show()

    b = np.cumsum(np.array(unique)[:,1])/np.sum(np.array(unique)[:,1]) * 100
    print("The "+ str(percentile) + "th percentile of conservation of TADs (in the dataset: " + name_of_dataset + ") is TADs that are seen in " + str(len(b[b < percentile])) + "+ tissue types.")
    return(unique)
In [ ]:
unique = findDistofConservedTADs(TAD_master, 'No window', 99)
In [ ]:
unique = findDistofConservedTADs(TAD_master_30k, '30k window', 99)
In [ ]:
unique = findDistofConservedTADs(TAD_master_100k, '100k window', 99)
unique[23:]

Export some ultraconserved regions BED files (see evernote for GREAT output from these results)

In [ ]:
# Just to start with, gives 38 regions
ultraconservedSubset = TAD_master_30k[(TAD_master_30k.iloc[:,5:].sum(axis=1) > 10)].loc[:,['chr','start_min','stop_max']]
ultraconservedSubset.to_csv('ultraconservedSubset.bed',sep='\t',index=False,header=False)

Unique TADs by tissue (see evernote for GREAT output from these results)

In [ ]:
uniqueTAD = TAD_master_100k[(TAD_master_100k.iloc[:,5:].sum(axis=1) <= 1) & (TAD_master_100k.loc[:,'rightVentricle'] == 1)].loc[:,['chr','start_max','stop_min']]

uniqueTAD.to_csv('unique_rightVentricle.bed',sep='\t',index=False,header=False)

uniqueTAD = TAD_master_100k[(TAD_master_100k.iloc[:,5:].sum(axis=1) <= 1) & (TAD_master_100k.loc[:,'H1_ESC_Dixon2015'] == 1)].loc[:,['chr','start_max','stop_min']]

uniqueTAD.to_csv('unique_H1_ESC_Dixon2015.bed',sep='\t',index=False,header=False)